Set working directory

setwd("~/Documents/Metagenomes_AMRIWA/R/AMRIWA/RFiles")

Load libraries

library(phyloseq)
library(dplyr)
library(viridis)
library(stringr)
library(vegan)
library(RColorBrewer)
library(BiocManager)
BiocManager::install("microbiome")
library(microbiome)
library(microbiomeutilities)
library(ggplot2)
library(knitr)
library(ggpubr)
library(pheatmap)
library(ape)
library(multcomp)

Load metadata

metadata <-read.table("metadata.txt", sep="\t", header = T, row.names = 1, fill = 1)

Metaxa2

# Load data
metaxa_genus <- read.delim("~/Documents/Metagenomes_AMRIWA/R/AMRIWA/RFiles/metaxa_genus.txt")

# Create OTU table
OTU_metaxa <- metaxa_genus[,-1]

# Create tax table
tax_table_metaxa <- data.frame(str_split_fixed(data.frame(metaxa_genus) [,1], ";", 6))
colnames(tax_table_metaxa) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus")

# Combine into phyloseq object
metaxa_PHY <- phyloseq(otu_table(OTU_metaxa, taxa_are_rows=TRUE), 
                       tax_table(as.matrix(tax_table_metaxa)), sample_data(metadata))

# Exclude Unknown
metaxa_PHY <- subset_taxa(metaxa_PHY, !Domain %in% c("Unknown"))

Add SSU counts into metadata

metadata$SSU_counts <- sample_sums(metaxa_PHY)

# Create extra column for Eucaryota, might be interesting to check out later
metaxa_PHY_temp <- subset_taxa(metaxa_PHY, !Domain %in% c("Unknown", "Bacteria", "Archaea", "Mitochondria", "Chloroplast"))
metadata$SSU_eucaryota <- sample_sums(metaxa_PHY_temp)

Filter samples

# With low quality (BFH24_S142, BH63_S118, FH10_S171), do at least this.
metaxa_PHY = subset_samples(metaxa_PHY, name != "BFH24_S142" & name != "BH63_S118" & name != "FH10_S171")

# Feces samples (BH20_S88, BH22_S89, BH24_S90, BH25_S91)
metaxa_PHY_ww = subset_samples(metaxa_PHY, name!="BH20_S88" & name!="BH22_S89" & name!="BH24_S90" & name!="BH25_S91")

# Or alternatively include fecal but exclude suspicious samples
# metaxa_PHY_ast = subset_samples(metaxa_PHY, "BH02_S77" & name != "BH27_S92" & name != "BH30_S94")

# Suspicious samples (very high ARG level)
metaxa_PHY_clean = subset_samples(metaxa_PHY_ww, name != "BH02_S77" & name != "BH27_S92" & name != "BH30_S94")

# Clear taxa that are not present in any of the remaining samples
any(taxa_sums(metaxa_PHY_clean) == 0)
## [1] TRUE
metaxa_PHY_clean <- prune_taxa(taxa_sums(metaxa_PHY_clean) > 0, metaxa_PHY_clean)
any(taxa_sums(metaxa_PHY_clean) == 0)
## [1] FALSE

Ordination to study whether the taxonomy can be explained by the variable “country”, Metxa2

# Change into relative data
metaxa_rel_PHY <- transform_sample_counts(metaxa_PHY_clean, function(x) x/sum(x))

# Ordinate and plot data
metaxa_rel_PHY_ord <- ordinate(metaxa_rel_PHY, method = "PCoA", distance = "horn")
p1 <- plot_ordination(metaxa_rel_PHY, metaxa_rel_PHY_ord, color = "country", label = "name")
metaxa.p1 <- p1 + scale_color_manual(values=c("#FF333F", "#35E0F5", "#531592")) + geom_point(colour = "black", 
    pch = 21, size = 2, alpha = 0.5) + stat_ellipse(level = 0.95, linetype = 1) +
    theme_minimal() + labs(title = "Metaxa2")
metaxa.p1

# Test significance using pair-wise adonis
metaxa_temp <- subset_samples(metaxa_PHY_clean, (country == "Benin" | country == "Finland"))
metaxa_dist <- vegdist(t(otu_table(metaxa_temp)), dist = "horn")
adonis(metaxa_dist ~ country, data = data.frame(sample_data(metaxa_temp), permutations = 9999))
## 
## Call:
## adonis(formula = metaxa_dist ~ country, data = data.frame(sample_data(metaxa_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## country    1    1.9647 1.96468  9.2344 0.16719  0.001 ***
## Residuals 46    9.7868 0.21276         0.83281           
## Total     47   11.7515                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
metaxa_temp <- subset_samples(metaxa_PHY_clean, (country == "Burkina Faso" | country == "Finland"))
metaxa_dist <- vegdist(t(otu_table(metaxa_temp)), dist = "horn")
adonis(metaxa_dist ~ country, data = data.frame(sample_data(metaxa_temp), permutations = 9999))
## 
## Call:
## adonis(formula = metaxa_dist ~ country, data = data.frame(sample_data(metaxa_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## country    1    1.0339 1.03386  4.7505 0.09549  0.001 ***
## Residuals 45    9.7935 0.21763         0.90451           
## Total     46   10.8274                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
metaxa_temp <- subset_samples(metaxa_PHY_clean, (country == "Benin" | country == "Burkina Faso"))
metaxa_dist <- vegdist(t(otu_table(metaxa_temp)), dist = "horn")
adonis(metaxa_dist ~ country, data = data.frame(sample_data(metaxa_temp), permutations = 9999))
## 
## Call:
## adonis(formula = metaxa_dist ~ country, data = data.frame(sample_data(metaxa_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## country    1    0.8852 0.88524  3.8733 0.04911  0.001 ***
## Residuals 75   17.1409 0.22855         0.95089           
## Total     76   18.0262                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Finnish samples seems to harbour more distinct taxonomic dibversity than those of Benin (R2=16.7%, p=0.001) and Burkina Faso (R2=9.6 %, p=0.001). Instead There is little grouping by country between Benin and Burkina Faso.

Is taxonomic composition is grouped significantly by location inside each country, Metaxa2

Benin <- subset_samples(metaxa_PHY_clean, (country == "Benin"))
Benin_dist <- vegdist(t(otu_table(Benin)), dist = "horn")
adonis(Benin_dist ~ location, data = data.frame(sample_data(Benin)), permutations = 99999)
## 
## Call:
## adonis(formula = Benin_dist ~ location, data = data.frame(sample_data(Benin)),      permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model     R2 Pr(>F)    
## location   4    2.1829 0.54572  2.9063 0.2548  1e-05 ***
## Residuals 34    6.3842 0.18777         0.7452           
## Total     38    8.5671                 1.0000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
BF <- subset_samples(metaxa_PHY_clean, (country == "Burkina Faso"))
BF_dist <- vegdist(t(otu_table(BF)), dist = "horn")
adonis(BF_dist ~ location, data = data.frame(sample_data(BF)), permutations = 99999)
## 
## Call:
## adonis(formula = BF_dist ~ location, data = data.frame(sample_data(BF)),      permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## location   4    2.0796 0.51989  2.6418 0.24255  1e-05 ***
## Residuals 33    6.4943 0.19680         0.75745           
## Total     37    8.5738                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Finland <- subset_samples(metaxa_PHY_clean, (country == "Finland"))
Finland_dist <- vegdist(t(otu_table(Finland)), dist = "horn")
adonis(Finland_dist ~ location, data = data.frame(sample_data(Finland)), permutations = 99999)
## 
## Call:
## adonis(formula = Finland_dist ~ location, data = data.frame(sample_data(Finland)),      permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)
## location   3   0.59080 0.19693  1.5658 0.48439 0.1495
## Residuals  5   0.62888 0.12578         0.51561       
## Total      8   1.21968                 1.00000
# In Benin the location defines the taxonomic composition at some level (R2=25.5 %, p=1e-05), as well as in Burkina Faso (R2=24.3 %, p=1e-05). In Finland grouping by location is not significant (due to small sample size).

Metaphlan3

# Modify data in command-line to match sample names in metadata file
## tr '|' ';' <merged_abundance_table.txt > mod_merged_abundance_table.txt
## sed -i 's/_profile//g' mod_merged_abundance_table.txt
## sed -i 's/BFH38-A_S156/BFH38.A_S156/g' mod_merged_abundance_table.txt
## sed -i 's/BFH38-B_S157/BFH38.B_S157/g' mod_merged_abundance_table.txt
## sed -i 's/BH34-A_S98/BH34.A_S98/g' mod_merged_abundance_table.txt
## sed -i 's/BH34-B_S99/BH34.B_S99/g' mod_merged_abundance_table.txt

# Load data
metaphlan <- as.matrix(read.table("mod_merged_abundance_table.txt", fill = 1, header = T, check.names = F))

# Exclude the first column "NCBI_tax_id"
metaphlan <- subset(metaphlan, select = -c(NCBI_tax_id))

# Create OTU table
OTU_metaphlan <- metaphlan[,-1]

# Change values as numeric
class(OTU_metaphlan) <- "numeric"

# Create tax_table
tax_table_metaphlan <- data.frame(str_split_fixed(data.frame(metaphlan) [,1], ";", 7))
colnames(tax_table_metaphlan) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")

# Clean "*__"
tax_table_metaphlan <- apply(tax_table_metaphlan, 2, function(y) (gsub(".__", "", y)))

# Combine into phyloseq object
metaphlan_PHY <- phyloseq(otu_table(OTU_metaphlan, taxa_are_rows = TRUE), tax_table(as.matrix(tax_table_metaphlan)), sample_data(metadata))

Filter samples

# Virus
metaphlan_PHY <- subset_taxa(metaphlan_PHY, Kingdom != "Virus")

# With low quality (BFH24_S142, BH63_S118, FH10_S171)
metaphlan_PHY = subset_samples(metaphlan_PHY, name != "BFH24_S142" & name != "BH63_S118" & name != "FH10_S171")

# Feces samples (BH20_S88, BH22_S89, BH24_S90, BH25_S91)
metaphlan_PHY_ww = subset_samples(metaphlan_PHY, name!="BH20_S88" & name!="BH22_S89" & name!="BH24_S90" & name!="BH25_S91")

# Or alternatively include fecal but exclude suspicious samples
# metaphlan_PHY_ast = subset_samples(metaphlan_PHY, "BH02_S77" & name != "BH27_S92" & name != "BH30_S94")

# Suspicious samples (very high ARG level)
metaphlan_PHY_clean = subset_samples(metaphlan_PHY_ww, name != "BH02_S77" & name != "BH27_S92" & name != "BH30_S94")

# Clear taxa that is not present in any of the remaining samples
any(taxa_sums(metaphlan_PHY_clean) == 0)
## [1] TRUE
metaphlan_PHY_clean <- prune_taxa(taxa_sums(metaphlan_PHY_clean) > 0, metaphlan_PHY_clean)
any(taxa_sums(metaphlan_PHY_clean) == 0)
## [1] FALSE

Ordination to study wheather the taxonomy can be explained by the variable “country”, Metaphlan3

# Ordinate and plot data
metaphlan_PHY_ord <- ordinate(metaphlan_PHY_clean, method = "PCoA", distance = "horn")
p2 <- plot_ordination(metaphlan_PHY_clean, metaphlan_PHY_ord, color = "country", label = "sample_type")
metaphlan.p2 <- p2 + scale_color_manual(values=c("#FF333F", "#35E0F5", "#531592")) + 
    geom_point(colour = "black", pch = 21, size = 2, alpha = 0.5) + stat_ellipse(level = 0.95, linetype = 1) +
    theme_minimal() + labs(title = "Metaphlan3")
metaphlan.p2

# Test significance using pair-wise adonis
metaphlan_temp <- subset_samples(metaphlan_PHY_clean, (country == "Benin" | country == "Finland"))
metaphlan_dist <- vegdist(t(otu_table(metaphlan_temp)), dist = "horn")
adonis(metaphlan_dist ~ country, data = data.frame(sample_data(metaphlan_temp), permutations = 9999))
## 
## Call:
## adonis(formula = metaphlan_dist ~ country, data = data.frame(sample_data(metaphlan_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)   
## country    1    0.6468 0.64678  3.6939 0.07433  0.005 **
## Residuals 46    8.0544 0.17510         0.92567          
## Total     47    8.7012                 1.00000          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
metaphlan_temp <- subset_samples(metaphlan_PHY_clean, (country == "Burkina Faso" | country == "Finland"))
metaphlan_dist <- vegdist(t(otu_table(metaphlan_temp)), dist = "horn")
adonis(metaphlan_dist ~ country, data = data.frame(sample_data(metaphlan_temp), permutations = 9999))
## 
## Call:
## adonis(formula = metaphlan_dist ~ country, data = data.frame(sample_data(metaphlan_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model     R2 Pr(>F)  
## country    1    0.3728 0.37282  2.5133 0.0529  0.017 *
## Residuals 45    6.6753 0.14834         0.9471         
## Total     46    7.0481                 1.0000         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
metaphlan_temp <- subset_samples(metaphlan_PHY_clean, (country == "Burkina Faso" | country == "Benin"))
metaphlan_dist <- vegdist(t(otu_table(metaphlan_temp)), dist = "horn")
adonis(metaphlan_dist ~ country, data = data.frame(sample_data(metaphlan_temp), permutations = 9999))
## 
## Call:
## adonis(formula = metaphlan_dist ~ country, data = data.frame(sample_data(metaphlan_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## country    1    0.7877 0.78768  4.4682 0.05623  0.001 ***
## Residuals 75   13.2216 0.17629         0.94377           
## Total     76   14.0093                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The explanatory variable "country" explains less than 10 % of the grouping in every pair-wise comparison.
# There seems to be a small group of samples differing from other ones. They represent mostly other than ww samples. Let's see if this grouping is significant.

metaphlan_temp <- subset_samples(metaphlan_PHY_clean, (sample_type == "waste water" | sample_type == "soil"))
metaphlan_dist <- vegdist(t(otu_table(metaphlan_temp)), dist = "horn")
adonis(metaphlan_dist ~ sample_type, data = data.frame(sample_data(metaphlan_temp), permutations = 9999))
## 
## Call:
## adonis(formula = metaphlan_dist ~ sample_type, data = data.frame(sample_data(metaphlan_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##             Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)
## sample_type  1    0.2356 0.23558  1.4281 0.01893  0.152
## Residuals   74   12.2075 0.16497         0.98107       
## Total       75   12.4431                 1.00000
metaphlan_temp <- subset_samples(metaphlan_PHY_clean, (sample_type == "waste water" | sample_type == "river water"))
metaphlan_dist <- vegdist(t(otu_table(metaphlan_temp)), dist = "horn")
adonis(metaphlan_dist ~ sample_type, data = data.frame(sample_data(metaphlan_temp), permutations = 9999))
## 
## Call:
## adonis(formula = metaphlan_dist ~ sample_type, data = data.frame(sample_data(metaphlan_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##             Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)   
## sample_type  1    0.6073 0.60729  3.6779 0.04616  0.002 **
## Residuals   76   12.5493 0.16512         0.95384          
## Total       77   13.1566                 1.00000          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pair-wise comparison between ww samples and river water gives a significant (p=0.002) R2 value of 4.6 %.

Is taxonomic composition is grouped significantly by location inside each country, Metaphlan3

Benin <- subset_samples(metaphlan_PHY_clean, (country == "Benin"))
Benin_dist <- vegdist(t(otu_table(Benin)), dist = "horn")
adonis(Benin_dist ~ location, data = data.frame(sample_data(Benin)), permutations = 99999)
## 
## Call:
## adonis(formula = Benin_dist ~ location, data = data.frame(sample_data(Benin)),      permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## location   4    2.1667 0.54168  3.5875 0.29679  1e-05 ***
## Residuals 34    5.1337 0.15099         0.70321           
## Total     38    7.3004                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
BF <- subset_samples(metaphlan_PHY_clean, (country == "Burkina Faso"))
BF_dist <- vegdist(t(otu_table(BF)), dist = "horn")
adonis(BF_dist ~ location, data = data.frame(sample_data(BF)), permutations = 99999)
## 
## Call:
## adonis(formula = BF_dist ~ location, data = data.frame(sample_data(BF)),      permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## location   4    1.3162 0.32904  2.3579 0.22228  6e-05 ***
## Residuals 33    4.6050 0.13955         0.77772           
## Total     37    5.9212                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Finland <- subset_samples(metaphlan_PHY_clean, (country == "Finland"))
Finland_dist <- vegdist(t(otu_table(Finland)), dist = "horn")
adonis(Finland_dist ~ location, data = data.frame(sample_data(Finland)), permutations = 99999)
## 
## Call:
## adonis(formula = Finland_dist ~ location, data = data.frame(sample_data(Finland)),      permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
## location   3   0.38042 0.126806  1.6969 0.50449 0.1398
## Residuals  5   0.37364 0.074728         0.49551       
## Total      8   0.75406                  1.00000
# The results seem similar to those from Metaxa2. Benin R2=29.7 %, p=1e-05; BF R2=22.2 %, p=5e-05; Finland not significant.

ResFinder

Load data

# Modify mapping output file "ARG_genemat.txt" in command line to match sample names in metadata file
## sed 's/BFH38-A_S156/BFH38.A_S156/g' ARG_genemat.txt > mod_ARG_genemat.txt
## sed -i 's/BFH38-B_S157/BFH38.B_S157/g' mod_ARG_genemat.txt
## sed -i 's/BH34-A_S98/BH34.A_S98/g' mod_ARG_genemat.txt
## sed -i 's/BH34-B_S99/BH34.B_S99/g' mod_ARG_genemat.txt

ARG_genemat <-as.matrix(read.table("mod_ARG_genemat.txt", header= T, check.names = F, row.names = 1))

# Save counts without gene names
#write.table(ARG_genemat, "~/Documents/Metagenomes_AMRIWA/R/AMRIWA/RFiles/ARG_genemat_counts", row.names=T, sep = "\t", col.names = T)

#ARG_genemat_counts <- read.delim("~/Documents/Metagenomes_AMRIWA/R/AMRIWA/RFiles/ARG_genemat_counts", row.names=NULL)

#ARG_genemat_counts$row.names<-NULL

Load ResFinder gene lengths if normalized with them

# Modify "resfinder.fasta" file so that only hits remain
## seqkit grep -f ARG_genes.txt resfinder.fasta > filtered_resfinder.fasta
# Check if there are correct number of lines
## grep ">" filtered_resfinder.fasta | wc -l
# Print out the gene lengths of these genes into file "lengths_resfinder.txt"
## cat filtered_resfinder.fasta | awk '$0 ~ ">" {if (NR > 1) {print c;} c=0;printf substr($0,2,100) "\t"; } $0 !~ ">" {c+=length($0);} END { print c; }' > lengths_resfinder.txt
# Reorder in excel to match with file "ARG_genemat_norm"
#lengths_resfinder <-as.matrix(read.table("lengths_resfinder.txt", header= F, check.names = T, row.names = 1))
#colnames(lengths_resfinder) <- c("Length")

Normalization with gene lengths

# Divide by ResFinder hit gene lengths
#ARG_length_norm <- ARG_genemat/lengths_resfinder[, 1]

# Divide by SSU counts and normalize to bacterial 16S rRNA length (1550)
#ARG_genemat_norm <- t(t(ARG_length_norm)/metadata$SSU_counts) * 1550

# Check if correct:
#identical(ARG_genemat[2020, 4]/metadata$SSU_counts[4], ARG_genemat_norm[2020, 4])

# Save and load again to exclude row.names
#write.table(ARG_genemat_norm, "~/Documents/Metagenomes_AMRIWA/R/AMRIWA/RFiles/ARG_genemat_norm.txt", row.names=T, sep = "\t", col.names = T)

#ARG_genemat_norm <- read.delim("~/Documents/Metagenomes_AMRIWA/R/AMRIWA/RFiles/ARG_genemat_norm.txt", row.names=NULL)
#ARG_genemat_norm$row.names<-NULL

Normalize ARG counts to SSU

ARG_genemat_norm <- t(t(ARG_genemat)/metadata$SSU_counts)

# Check if correct:
identical(ARG_genemat[2020, 4]/metadata$SSU_counts[10], ARG_genemat_norm[2020, 4])
## [1] TRUE
# Save and load again to exclude row.names
write.table(ARG_genemat_norm, "~/Documents/Metagenomes_AMRIWA/R/AMRIWA/RFiles/ARG_genemat_norm.txt", row.names=T, sep = "\t", col.names = T)

ARG_genemat_norm <- read.delim("~/Documents/Metagenomes_AMRIWA/R/AMRIWA/RFiles/ARG_genemat_norm.txt", row.names=NULL)
ARG_genemat_norm$row.names<-NULL

Create phyloseq object

# Create tax table
tax_table_resfinder <- read.csv("~/Documents/Metagenomes_AMRIWA/R/AMRIWA/RFiles/tax_table_resfinder.txt", header=FALSE, sep=";")
colnames(tax_table_resfinder) <- c("Gene", "Class")

# Combine to phyloseq object
resfinder_PHY <- phyloseq(otu_table(ARG_genemat_norm, taxa_are_rows = TRUE), sample_data(metadata), 
    tax_table(as.matrix(tax_table_resfinder)))

# Have a quick look
plot_bar(otu_table(resfinder_PHY))

# There are the suspicious samples (BH02, BH27, BH30).

Filter samples

# With low quality (BFH24_S142, BH63_S118, FH10_S171)
resfinder_PHY = subset_samples(resfinder_PHY, name != "BFH24_S142" & name != "BH63_S118" & name != "FH10_S171")

# Feces samples (BH20_S88, BH22_S89, BH24_S90, BH25_S91)
resfinder_PHY_ww = subset_samples(resfinder_PHY, name!="BH20_S88" & name!="BH22_S89" & name!="BH24_S90" & name!="BH25_S91")

# Or alternatively include fecal but exclude suspicious samples
# resfinder_PHY_ast = subset_samples(resfinder_PHY, "BH02_S77" & name != "BH27_S92" & name != "BH30_S94")

# Suspicious samples (very high ARG level) (BH02_S77, BH27_S92, BH30_S94)
resfinder_PHY_clean = subset_samples(resfinder_PHY_ww, name != "BH02_S77" & name != "BH27_S92" & name != "BH30_S94")

# Check if there are taxa that are not present in any of the remaining samples
any(taxa_sums(resfinder_PHY_clean) == 0)
## [1] TRUE
resfinder_PHY_clean <- prune_taxa(taxa_sums(resfinder_PHY_clean) > 0, resfinder_PHY_clean)
any(taxa_sums(resfinder_PHY_clean) == 0)
## [1] FALSE

Ordinate ResFinder mapping results to study wheather the taxonomy can be explained by the variable “country”

# Ordinate and plot data
resfinder_PHY_ord <- ordinate(resfinder_PHY_clean, method = "PCoA", distance = "horn")
p3 <- plot_ordination(resfinder_PHY_clean, resfinder_PHY_ord, color = "country", label = "name")
resfinder.p3 <- p3 + scale_color_manual(values=c("#FF333F", "#35E0F5", "#531592")) + 
    geom_point(colour = "black", pch = 21, size = 2, alpha = 0.5) + stat_ellipse(level = 0.95, linetype = 1) +
    theme_minimal() + labs(title = "ResFinder")
resfinder.p3

# Test significance between all pairs
resfinder_temp <- subset_samples(resfinder_PHY_clean, (country == "Finland" | country == "Benin"))
resfinder_dist <- vegdist(t(otu_table(resfinder_temp)), dist = "horn")
adonis(resfinder_dist ~ country, data = data.frame(sample_data(resfinder_temp), permutations = 9999))
## 
## Call:
## adonis(formula = resfinder_dist ~ country, data = data.frame(sample_data(resfinder_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## country    1    1.5666 1.56661  5.9906 0.11522  0.001 ***
## Residuals 46   12.0295 0.26151         0.88478           
## Total     47   13.5961                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
resfinder_temp <- subset_samples(resfinder_PHY_clean, (country == "Burkina Faso" | country == "Benin"))
resfinder_dist <- vegdist(t(otu_table(resfinder_temp)), dist = "horn")
adonis(resfinder_dist ~ country, data = data.frame(sample_data(resfinder_temp), permutations = 9999))
## 
## Call:
## adonis(formula = resfinder_dist ~ country, data = data.frame(sample_data(resfinder_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)   
## country    1    0.7633 0.76329  3.0538 0.03912  0.002 **
## Residuals 75   18.7461 0.24995         0.96088          
## Total     76   19.5094                 1.00000          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
resfinder_temp <- subset_samples(resfinder_PHY_clean, (country == "Finland" | country == "Burkina Faso"))
resfinder_dist <- vegdist(t(otu_table(resfinder_temp)), dist = "horn")
adonis(resfinder_dist ~ country, data = data.frame(sample_data(resfinder_temp), permutations = 9999))
## 
## Call:
## adonis(formula = resfinder_dist ~ country, data = data.frame(sample_data(resfinder_temp),      permutations = 9999)) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## country    1    1.4172 1.41723  6.4599 0.12553  0.001 ***
## Residuals 45    9.8725 0.21939         0.87447           
## Total     46   11.2897                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Between Benin and Finland, country explains 11.5 % (p=0.001) of the divergence in the resistome. Between BF and Finland the same is 12.6 % (p=0.001).

Is taxonomic composition is grouped significantly by location inside each country, ResFinder

Benin <- subset_samples(resfinder_PHY_clean, (country == "Benin"))
Benin_dist <- vegdist(t(otu_table(Benin)), dist = "horn")
adonis(Benin_dist ~ location, data = data.frame(sample_data(Benin)), permutations = 99999)
## 
## Call:
## adonis(formula = Benin_dist ~ location, data = data.frame(sample_data(Benin)),      permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## location   4    2.7747 0.69368  3.0722 0.26548  2e-05 ***
## Residuals 34    7.6768 0.22579         0.73452           
## Total     38   10.4516                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
BF <- subset_samples(resfinder_PHY_clean, (country == "Burkina Faso"))
BF_dist <- vegdist(t(otu_table(BF)), dist = "horn")
adonis(BF_dist ~ location, data = data.frame(sample_data(BF)), permutations = 99999)
## 
## Call:
## adonis(formula = BF_dist ~ location, data = data.frame(sample_data(BF)),      permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2  Pr(>F)   
## location   4    1.6955 0.42388  2.1197 0.20441 0.00197 **
## Residuals 33    6.5990 0.19997         0.79559           
## Total     37    8.2945                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Finland <- subset_samples(resfinder_PHY_clean, (country == "Finland"))
Finland_dist <- vegdist(t(otu_table(Finland)), dist = "horn")
adonis(Finland_dist ~ location, data = data.frame(sample_data(Finland)), permutations = 99999)
## 
## Call:
## adonis(formula = Finland_dist ~ location, data = data.frame(sample_data(Finland)),      permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)
## location   3   0.71217 0.23739   1.371 0.45133 0.1393
## Residuals  5   0.86577 0.17315         0.54867       
## Total      8   1.57794                 1.00000
# In Benin and BF the grouping by location has R2 values like 26.5 % (p=4e-05) and 20.4 % (p=0.00175). In Finland no significant correlation can be proven.

Diversity indexes by Metaxa2

# Take a look at the indexes
metaxa_tab <-microbiome::alpha(metaxa_PHY_clean, index = "all")
## Observed richness
## Other forms of richness
## Diversity
## Evenness
## Dominance
## Rarity
kable(head(metaxa_tab))

# Create data table
metaxa.meta <- meta(metaxa_PHY_clean)
kable(head(metaxa.meta))

# Add indexes to data table
## Shannon
metaxa.meta$diversity_shannon <- metaxa_tab$diversity_shannon
## Gini Simpson
metaxa.meta$diversity_gini_simpson <- metaxa_tab$diversity_gini_simpson
## Inverse Simpson
metaxa.meta$diversity_inverse_simpson <- metaxa_tab$diversity_inverse_simpson

# Check whether the distribution of the diversity is normal
hist(metaxa.meta$diversity_shannon)
shapiro.test(metaxa.meta$diversity_shannon) # Does not fit the assumption of normal distribution > wilcox test.
qqnorm(metaxa.meta$diversity_shannon)

hist(metaxa.meta$diversity_gini_simpson)
shapiro.test(metaxa.meta$diversity_gini_simpson) # Does not fit the assumption of normal distribution > wilcox test.
qqnorm(metaxa.meta$diversity_gini_simpson)

hist(metaxa.meta$diversity_inverse_simpson)
shapiro.test(metaxa.meta$diversity_inverse_simpson)
qqnorm(metaxa.meta$diversity_inverse_simpson)

# Create a list of pairwise comparisons
div.country <- levels(metaxa.meta$country)
country.pairs <- combn(seq_along(div.country), 2, simplify = FALSE, FUN = function(i)div.country[i])
print(country.pairs)

# For inverse simpson index, data fullfills the assumption for normal distribution. For that t-test will be apllied. For the other too wilcox test will be used to test the significance.

Diversity plots, Metaxa2

# Shannon
p4 <- ggviolin(metaxa.meta, x = "country", y = "diversity_shannon",
 add = "boxplot", fill = "country", palette = c("#FF333F", "#35E0F5", "#531592")) 
p4 <- p4 + stat_compare_means(comparisons = country.pairs, method = "wilcox.test") 
print(p4)

# Gini Simpson
p5 <- ggviolin(metaxa.meta, x = "country", y = "diversity_gini_simpson",
 add = "boxplot", fill = "country", palette = c("#FF333F", "#35E0F5", "#531592")) 
p5 <- p5 + stat_compare_means(comparisons = country.pairs, method = "wilcox.test") 
print(p5)

# Inverse Simpson
p6 <- ggviolin(metaxa.meta, x = "country", y = "diversity_inverse_simpson",
 add = "boxplot", fill = "country", palette = c("#FF333F", "#35E0F5", "#531592")) 
p6 <- p6 + stat_compare_means(comparisons = country.pairs, method = "t.test") 
print(p6)

# In Finnish samples seems to have significantly lower alpha diversity in taxonomy according to Metaxa2 compared to Benin and Burkina Faso.

Diversity indexes by Metaphlan3

# Shannon
## Take a look at the indexes
metaphlan_tab <-microbiome::alpha(metaphlan_PHY_clean, index = "diversity_shannon")
## Observed richness
## Other forms of richness
## Diversity
## Evenness
## Dominance
## Rarity
kable(head(metaphlan_tab))

## Create data table
metaphlan.meta <- meta(metaphlan_PHY_clean)
kable(head(metaphlan.meta))

## Add indexes to data table
metaphlan.meta$diversity_shannon <- metaphlan_tab$diversity_shannon

# Check whether the distribution of the diversity is normal
hist(metaphlan.meta$diversity_shannon)
shapiro.test(metaphlan.meta$diversity_shannon)
qqnorm(metaphlan.meta$diversity_shannon)

# Gini Simpson
## Take a look at the indexes
metaphlan_tab <-microbiome::alpha(metaphlan_PHY_clean, index = "diversity_gini_simpson")
## Observed richness
## Other forms of richness
## Diversity
## Evenness
## Dominance
## Rarity
kable(head(metaphlan_tab))

## Add indexes to data table
metaphlan.meta$diversity_gini_simpson <- metaphlan_tab$diversity_gini_simpson

# Check whether the distribution of the diversity is normal
hist(metaphlan.meta$diversity_gini_simpson)
shapiro.test(metaphlan.meta$diversity_gini_simpson)
qqnorm(metaphlan.meta$diversity_gini_simpson)

# Create a list of pairwise comparisons
div.country <- levels(metaphlan.meta$country)
country.pairs <- combn(seq_along(div.country), 2, simplify = FALSE, FUN = function(i)div.country[i])
print(country.pairs)

# Inverse Simpson
## Take a look at the indexes
metaphlan_tab <-microbiome::alpha(metaphlan_PHY_clean, index = "diversity_inverse_simpson")
## Observed richness
## Other forms of richness
## Diversity
## Evenness
## Dominance
## Rarity
kable(head(metaphlan_tab))

## Add indexes to data table
metaphlan.meta$diversity_inverse_simpson <- metaphlan_tab$diversity_inverse_simpson

# Check whether the distribution of the diversity is normal
hist(metaphlan.meta$diversity_inverse_simpson)
shapiro.test(metaphlan.meta$diversity_inverse_simpson) 
qqnorm(metaphlan.meta$diversity_inverse_simpson)

Diversity plots, Metaphlan3

p7 <- ggviolin(metaphlan.meta, x = "country", y = "diversity_shannon",
 add = "boxplot", fill = "country", palette = c("#FF333F", "#35E0F5", "#531592")) 
metaphlan.p7 <- p7 + stat_compare_means(comparisons = country.pairs, method = "wilcox.test")
metaphlan.p7

p8 <- ggviolin(metaphlan.meta, x = "country", y = "diversity_gini_simpson",
 add = "boxplot", fill = "country", palette = c("#FF333F", "#35E0F5", "#531592")) 
metaphlan.p8 <- p8 + stat_compare_means(comparisons = country.pairs, method = "wilcox.test") 
metaphlan.p8 

p9 <- ggviolin(metaphlan.meta, x = "country", y = "diversity_inverse_simpson",
 add = "boxplot", fill = "country", palette = c("#FF333F", "#35E0F5", "#531592")) 
metaphlan.p9 <- p9 + stat_compare_means(comparisons = country.pairs, method = "t.test") 
metaphlan.p9

# According to metaphlan there are no significant differences between the alpha diversities of samples from different countries.

Diversity indexes by ResFinder

# Shannon
## Take a look at the indexes
resfinder_tab <-microbiome::alpha(resfinder_PHY_clean, index = "diversity_shannon")
## Observed richness
## Other forms of richness
## Diversity
## Evenness
## Dominance
## Rarity
kable(head(resfinder_tab))

## Create data table
resfinder.meta <- meta(resfinder_PHY_clean)
kable(head(resfinder.meta))

## Add indexes to data table
resfinder.meta$diversity_shannon <- resfinder_tab$diversity_shannon

# Check whether the distribution of the diversity is normal
hist(resfinder.meta$diversity_shannon)
shapiro.test(resfinder.meta$diversity_shannon) # Does not fit the assumption of normal distribution > wilcox test.
qqnorm(resfinder.meta$diversity_shannon)

# Create a list of pairwise comparisons
div.country <- levels(resfinder.meta$country)
country.pairs <- combn(seq_along(div.country), 2, simplify = FALSE, FUN = function(i)div.country[i])
print(country.pairs)

# Gini Simpson
## Take a look at the indexes
resfinder_tab <-microbiome::alpha(resfinder_PHY_clean, index = "diversity_gini_simpson")
## Observed richness
## Other forms of richness
## Diversity
## Evenness
## Dominance
## Rarity
kable(head(resfinder_tab))

## Add indexes to data table
resfinder.meta$diversity_gini_simpson <- resfinder_tab$diversity_gini_simpson

# Check whether the distribution of the diversity is normal
hist(resfinder.meta$diversity_gini_simpson)
shapiro.test(resfinder.meta$diversity_gini_simpson) # Does not fit the assumption of normal distribution > wilcox test.
qqnorm(resfinder.meta$diversity_gini_simpson)

# Inverse Simpson
## Take a look at the indexes
resfinder_tab <-microbiome::alpha(resfinder_PHY_clean, index = "diversity_inverse_simpson")
## Observed richness
## Other forms of richness
## Diversity
## Evenness
## Dominance
## Rarity
kable(head(resfinder_tab))

## Add indexes to data table
resfinder.meta$diversity_inverse_simpson <- resfinder_tab$diversity_inverse_simpson

# Check whether the distribution of the diversity is normal
hist(resfinder.meta$diversity_inverse_simpson)
shapiro.test(resfinder.meta$diversity_inverse_simpson) 
qqnorm(resfinder.meta$diversity_inverse_simpson)

Diversity plots, ResFinder

p10 <- ggviolin(resfinder.meta, x = "country", y = "diversity_shannon",
 add = "boxplot", fill = "country", palette = c("#FF333F", "#35E0F5", "#531592")) 
resfinder.p10 <- p10 + stat_compare_means(comparisons = country.pairs, method = "wilcox.test")
resfinder.p10

p11 <- ggviolin(resfinder.meta, x = "country", y = "diversity_gini_simpson",
 add = "boxplot", fill = "country", palette = c("#FF333F", "#35E0F5", "#531592")) 
resfinder.p11 <- p11 + stat_compare_means(comparisons = country.pairs, method = "wilcox.test") 
resfinder.p11

p12 <- ggviolin(resfinder.meta, x = "country", y = "diversity_inverse_simpson",
 add = "boxplot", fill = "country", palette = c("#FF333F", "#35E0F5", "#531592")) 
resfinder.p12 <- p12 + stat_compare_means(comparisons = country.pairs, method = "t.test") 
resfinder.p12

# The resistome seems less complex in Finnish samples than in Burkinabe and Beninise.

Diversity of resistomes by individual samples

nb.cols <- 14
mycolors <- colorRampPalette(brewer.pal(11, "Spectral"))(nb.cols)

loc <- plot_richness(resfinder_PHY_clean, measures = c("Shannon", "Simpson"), 
                     color="location", shape = "country")

resfinder.in <- loc + scale_fill_manual(values=mycolors) + 
  geom_boxplot() +
  geom_point(size=3) + theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size =5))
resfinder.in

# Taxonomic composition by Metaxa2 using microbiome package

# Use the unfiltered data this time to see the differences in fecal samples etc
metaxa_rel_PHY_orig <- transform_sample_counts(metaxa_PHY, function(x) x/sum(x))

# Load data
metaxa.com <- metaxa_rel_PHY_orig
taxic_metaxa <- as.data.frame(metaxa.com@tax_table) 

# Add OTU ids
taxic_metaxa$OTU <- rownames(taxic_metaxa) 
colnames(taxic_metaxa)
## [1] "Domain" "Phylum" "Class"  "Order"  "Family" "Genus"  "OTU"
# Convert into a matrix.
taxmat_metaxa <- as.matrix(taxic_metaxa)

# Convert into phyloseq compatible file.
new.tax_metaxa <- tax_table(taxmat_metaxa)  

# Incroporate into new phyloseq object
tax_table(metaxa.com) <- new.tax_metaxa

# Class level
metaxa_top5class <- aggregate_top_taxa(metaxa.com, top = 5, "Class")

metaxa_top5class.p13 <- plot_composition(metaxa_top5class, group_by = "country") + 
  theme(legend.position = "bottom") + theme_classic() +
  theme(axis.text.x = element_text(angle = 90, size = 8), axis.title = element_blank()) +
    ggtitle("Relative abundance of top 5 classes, Metaxa2")

metaxa_top5class.p13 + 
    scale_fill_manual(values = c("#1B9E77","#E6AB02","#7570B3","#E7298A","#66A61E", "#FFFFFF"))

# Technicval replicates seem highly similar to each other. Fecal samples have different diversity in the class taxonomy: more Closridia and Betaproteobacteria than Gammaproteobacteria and no Epsilonproteobacteria when comparing to ww samples. Of the suscpicious samples especially in BH02 the Gammaproteobacteria are dominating. This would make sense if the sample was from CLED plate which is selective for Enterobacteriaceae species. 

Top 15 genera merged by location, Metaxa2

metaxa_PHY_genus <- tax_glom(metaxa_rel_PHY, "Genus")

metaxa_PHY_genus_abun <- prune_taxa(names(sort(taxa_sums(metaxa_PHY_genus), TRUE)[1:15]), 
    metaxa_PHY_genus)

# Transform_1
metaxa_PHY_genus_abun_trans <- transform_sample_counts(metaxa_PHY_genus_abun, function(x) x / sum(x))

# Merge by location
metaxa_mrg_loc <- merge_samples(metaxa_PHY_genus_abun_trans, "location")

# Transform_2
metaxa_mrg_loc_trans <- transform_sample_counts(metaxa_mrg_loc, function(x) x / sum(x))

# Normalize by sample number / location
otu_table(metaxa_mrg_loc_trans) <- otu_table(metaxa_mrg_loc_trans)[, ]/as.matrix(table(sample_data(metaxa_PHY)$location))[, 
    1]

# Set colors
nb.cols <- 15
mycolors <- colorRampPalette(brewer.pal(11, "Spectral"))(nb.cols)

# Facet labs
country.labs <- c("1" = "Benin", "2" = "Burkina Faso", "3" = "Finland")

# Plot
metaxa.p14 <- plot_bar(metaxa_mrg_loc_trans, fill = "Genus", title = "15 most abundant genera, Metaxa2") + 
  facet_grid(~country, scales = "free_x", space = "free_x", labeller = labeller(country = country.labs)) +
  theme_classic() +
  theme(axis.text.x = element_text(angle=55, hjust=1, size = 10),
        axis.title.x = element_blank()) +
        ylab("Merged relative abundance (%)") + xlab("") +
  scale_fill_manual(values=mycolors) + geom_bar(stat="identity") +
  guides(fill = guide_legend(label.theme = element_text(size = 8), keywidth = 1.5, keyheight = 1.5, title = "Genus"))

Top 15 genera merged by location, Metaphlan3

metaphlan_PHY_genus <- tax_glom(metaphlan_PHY_clean, "Genus")

metaphlan_PHY_genus_abun <- prune_taxa(names(sort(taxa_sums(metaphlan_PHY_genus), TRUE)[1:15]), 
    metaphlan_PHY_genus)

# Transform_1
metaphlan_PHY_genus_abun_trans <- transform_sample_counts(metaphlan_PHY_genus_abun, function(x) x / sum(x))

# Merge by location
metaphlan_mrg_loc <- merge_samples(metaphlan_PHY_genus_abun_trans, "location")

# Transform_2
metaphlan_mrg_loc_trans <- transform_sample_counts(metaphlan_mrg_loc, function(x) x / sum(x))

# Normalize by sample number / location
otu_table(metaphlan_mrg_loc_trans) <- otu_table(metaphlan_mrg_loc_trans)[, ]/as.matrix(table(sample_data(metaphlan_PHY)$location))[, 
    1]

# Set colors
nb.cols <- 15
mycolors <- colorRampPalette(brewer.pal(11, "Spectral"))(nb.cols)
# Facet labs
country.labs <- c("1" = "Benin", "2" = "Burkina Faso", "3" = "Finland")

# Plot
metaphlan.p16 <- plot_bar(metaphlan_mrg_loc_trans, fill = "Genus", title = "15 most abundant genera, Metaphlan3") + 
  facet_grid(~country, scales = "free_x", space = "free_x", labeller = labeller(country = country.labs)) +
  theme_classic() +
  theme(axis.text.x = element_text(angle=55, hjust=1, size = 10),
        axis.title.x = element_blank()) +
        ylab("Relative abundance (%)") + xlab("") +
  scale_fill_manual(values=mycolors) + geom_bar(stat="identity") +
  guides(fill = guide_legend(label.theme = element_text(size = 8), keywidth = 1.5, keyheight = 1.5, title = "Genus"))

# Plot both
metaxa.p14

metaphlan.p16

# The results seem coherent between the two methods. Aeromonas sp are relatively common in Finnish samples.

Heatmap of Metaxa2 results

# Let's use again the unfiltered data
metaphlan_PHY_genus <- tax_glom(metaphlan_PHY, "Genus")
heat.metaphlan <- plot_taxa_heatmap(metaphlan_PHY_genus,
                                 subset.top = 50,
                                 VariableA = "country",
                                 heatcolors = rev(brewer.pal(12, "GnBu")),
                                 transformation = "log10",
                                 cluster_cols = T)
heat.metaphlan

# The plot could be edited to have bigger font size and have more annotations (location, sample type).

Sum of the relative abundances of ResFinder results by location

# Average 16S counts by location
mean16S_counts <- mean(metadata$SSU_counts, invert = TRUE)

lo_data <- data.frame(SUM = round(mean16S_counts * (sample_sums(otu_table(resfinder_PHY_clean)))), 
    location = sample_data(resfinder_PHY_clean)$location)

plot(lo_data$SUM~lo_data$location, cex.axis=0.5, srt=45, las=2, xlab=NULL)

# Testing models to support the data
## Linear
model.ln<-lm(SUM~location, data=lo_data)
summary(model.ln)
shapiro.test(lo_data$SUM)
plot(model.ln)
## Warning: not plotting observations with leverage one:
##   69

## Warning: not plotting observations with leverage one:
##   69
# The p value is > 0.05 and differs significantly from normal distribution

## Log
model.logln<-lm(log(SUM)~location, data=lo_data)
summary(model.logln)
shapiro.test(log(lo_data$SUM))
plot(model.logln)
## Warning: not plotting observations with leverage one:
##   69

## Warning: not plotting observations with leverage one:
##   69
# The p value is > 0.05 and differs significantly from normal distribution

## Poisson
model.pois = glm(SUM ~ location , data = lo_data, family = poisson)
summary(model.pois)
plot(model.pois)
## Warning: not plotting observations with leverage one:
##   69

## Warning: not plotting observations with leverage one:
##   69
## Quasipoisson
model.qpois<-glm(SUM~location, data=lo_data, family="quasipoisson")
summary(model.qpois)
plot(model.qpois)
## Warning: not plotting observations with leverage one:
##   69

## Warning: not plotting observations with leverage one:
##   69
## Negative binomial
model.nb <- glm.nb(SUM~location, data=lo_data)
summary(model.nb)
plot(model.nb)
## Warning: not plotting observations with leverage one:
##   69

## Warning: not plotting observations with leverage one:
##   69
# Summarize and compare models
data.frame(linear=coef(model.ln),
                 loglinear=exp(coef(model.logln)),
                 poisson=exp(coef(model.pois)),
                 qpoisson=exp(coef(model.qpois)),
                 negbin=exp(coef(model.nb)))
# Critical value
qchisq(0.95, df.residual(model.pois))

deviance(model.ln)
deviance(model.logln)
deviance(model.pois)
deviance(model.qpois)
deviance(model.nb)

# None of the models fits the data. However, negative binominal model gets the closest.
# Plot using neg. binom. model
fit <- glm.nb(SUM ~ location, data = lo_data, link = log)

lo_data <- cbind(lo_data, Mean = predict(fit, newdata = lo_data, type = "response"), SE = predict(fit, newdata = lo_data, type = "response", se.fit = T)$se.fit)

nb.cols <- 14
mycolors <- colorRampPalette(brewer.pal(11, "Spectral"))(nb.cols)

ARG.sum.lo <- ggplot(lo_data, aes(x = location, y = Mean)) +
    geom_line() + geom_jitter(data = lo_data, aes(x = location, y = SUM, color = location), 
    size = 2.3, alpha = 0.5, width = 0.3) + 
    geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.6, lwd = 0.6) + 
    geom_point(size = 0.9) +
    scale_fill_manual(values = mycolors) +
    theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    labs(y = "Sum abundance/16S", 
    x = "") + guides(color = FALSE, alpha = FALSE) + labs(title = "ARGs") +
  scale_y_continuous(trans = "log10")
ARG.sum.lo

Extract outliers

#lo_data <- data.frame(SUM = round(mean16S_counts * (sample_sums(otu_table(resfinder_PHY)))), location = sample_data(resfinder_PHY)$location)

#boxplot(lo_data$SUM)
#boxplot(lo_data$SUM, plot=FALSE)$out
#outliers <- boxplot(lo_data$SUM, plot=FALSE)$out
#print(outliers)

# Where they at
#lo_data[which(lo_data$SUM %in% outliers),]
# Remove
#lo_data <- lo_data[-which(lo_data$SUM %in% outliers),]
#boxplot(lo_data$SUM)

# New  plot
#fit <- glm.nb(SUM ~ location, data = lo_data, link = log)

#lo_data <- cbind(lo_data, Mean = predict(fit, newdata = lo_data, type = "response"), SE = predict(fit, newdata =lo_data, type = "response", se.fit = T)$se.fit)

#nb.cols <- 14
#mycolors <- colorRampPalette(brewer.pal(8, "Dark2"))(nb.cols)

#ARG.sum.lo_outl <- ggplot(lo_data, aes(x = location, y = Mean)) +
    #scale_fill_manual(values = mycolors) +
    #geom_line() + geom_jitter(data = lo_data, aes(x = location, y = SUM, color = location), 
    #size = 2.3, alpha = 0.5, width = 0.3) + 
    #geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.6, lwd = 0.6) + 
    #geom_point(size = 0.9) +
    #theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    #labs(y = "Sum abundance/16S", x = "") + guides(color = FALSE, alpha = FALSE) + labs(title = "ARGs") + scale_y_log10()
#ARG.sum.lo_outl

Check if significant

lo_data <- data.frame(SUM = round(mean16S_counts * (sample_sums(otu_table(resfinder_PHY_clean)))), 
    location = sample_data(resfinder_PHY_clean)$location)

# When outliers removed use:
#lo_data[which(lo_data$SUM %in% outliers),]
#lo_data <- lo_data[-which(lo_data$SUM %in% outliers),]

fit <- glm.nb(SUM ~ location, data = lo_data, link = log)

# Tukey's post hoc test
glht.mod <- glht(fit, mcp(location = "Tukey"))
print(summary(glht(glht.mod)))
## Warning in chkdots(...): Argument(s) 'complete' passed to '...' are ignored

## Warning in chkdots(...): Argument(s) 'complete' passed to '...' are ignored
## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps
# Significant comparisons:
#Clinique SOUKA - Central clinic of Calavi == 0                                        -3.955    <0.01 ** 
#Helsinki University Hospital - Central clinic of Calavi == 0                          -3.378   0.0399 *  
#Hopital de Zone Calavi - Central clinic of Calavi == 0                                -4.927    <0.01 ***
#Savalou area - Central clinic of Calavi == 0                                          -8.032    <0.01 ***
#Source de Vie, Ouagadougou - Central clinic of Calavi == 0                            -3.920    <0.01 ** 
#Savalou area - Central Finland Health Care District == 0                              -3.939    <0.01 ** 
#Savalou area - Clinique SOUKA == 0                                                    -4.638    <0.01 ***
#Savalou area - Helsinki University Hospital == 0                                      -3.826    <0.01 ** 
#Savalou area - Hopital de Zone Calavi == 0                                            -4.394    <0.01 ***
#Yalgado teaching hospital of Ouagadougou - Hopital de Zone Calavi == 0                 3.623   0.0171 *  
#Savalou area - Hospital Saint Jean == 0                                               -6.079    <0.01 ***
#Savalou area - Koudougou Hospital == 0                                                -6.427    <0.01 ***
#Savalou area - Menontin hospital == 0                                                 -6.546    <0.01 ***
#Savalou area - Nanoro Hospital == 0                                                   -3.868    <0.01 ** 
#Source de Vie, Ouagadougou - Savalou area == 0                                         3.308   0.0492 *  
#Yalgado teaching hospital of Ouagadougou - Savalou area == 0                           7.406    <0.01 ***

Sum of the relative abundances of ResFinder results by country

# Average 16S counts bu location
mean16S_counts <- mean(metadata$SSU_counts, invert = TRUE)

co_data <- data.frame(SUM = round(mean16S_counts * (sample_sums(otu_table(resfinder_PHY_clean)))), 
    country = sample_data(resfinder_PHY_clean)$country)

plot(co_data$SUM~co_data$country, cex.axis=0.5, srt=45, las=2, xlab=NULL)

# Testing models to support the data
## Linear
model.ln<-lm(SUM~country, data=co_data)
summary(model.ln)
shapiro.test(lo_data$SUM)
plot(model.ln)

# The p value is > 0.05 and differs significantly from normal distribution

## Log
model.logln<-lm(log(SUM)~country, data=co_data)
summary(model.logln)
shapiro.test(log(co_data$SUM))
plot(model.logln)

# Data is not normally distributed.

## Poisson
model.pois = glm(SUM~country , data = co_data, family = poisson)
summary(model.pois)
plot(model.pois)

## Quasipoisson
model.qpois<-glm(SUM~country, data=co_data, family="quasipoisson")
summary(model.qpois)
plot(model.qpois)

## Negative binomial
model.nb <- glm.nb(SUM~country, data=co_data)
summary(model.nb)
plot(model.nb)

1/model.nb$theta
-2*(logLik(model.pois)-logLik(model.nb))

# Summarize models
data.frame(linear=coef(model.ln),
                 loglinear=exp(coef(model.logln)),
                 poisson=exp(coef(model.pois)),
                 qpoisson=exp(coef(model.qpois)),
                 negbin=exp(coef(model.nb)))
# Critical value
qchisq(0.95, df.residual(model.nb))

deviance(model.ln)
deviance(model.logln)
deviance(model.pois)
deviance(model.qpois)
deviance(model.nb)

# None of the models fits the data. However, negative binominal gets the closest.
# Plot using neg. binom. model
co_data <- data.frame(SUM = round(mean16S_counts * (sample_sums(otu_table(resfinder_PHY_clean)))), 
    country = sample_data(resfinder_PHY_clean)$country)

fit <- glm.nb(SUM ~ country, data = co_data, link = log)

co_data <- cbind(co_data, Mean = predict(fit, newdata = co_data, type = "response"), SE = predict(fit, newdata = co_data, type = "response", se.fit = T)$se.fit)

ARG.sum.co <- ggplot(co_data, aes(x = country, y = Mean)) +
        scale_color_manual(values=c("#FF333F", "#35E0F5", "#531592")) +
    geom_line() + geom_jitter(data = co_data, aes(x = country, y = SUM, color = country), 
    size = 2.3, alpha = 0.5, width = 0.3) + 
    geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.6, lwd = 0.6) + 
    geom_point(size = 0.9) +
    theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    labs(y = "Sum abundance/16S", 
    x = "") + guides(color = FALSE, alpha = FALSE) + labs(title = "ARGs") + scale_y_log10()
ARG.sum.co

Extract outliers

#co_data <- data.frame(SUM = round(mean16S_counts * (sample_sums(otu_table(resfinder_PHY)))), country = sample_data(resfinder_PHY)$country)

#boxplot(co_data$SUM)
#boxplot(co_data$SUM, plot=FALSE)$out
#outliers <- boxplot(co_data$SUM, plot=FALSE)$out
#print(outliers)

# Where they at
#co_data[which(co_data$SUM %in% outliers),]
# Remove
#co_data <- co_data[-which(co_data$SUM %in% outliers),]
#boxplot(co_data$SUM)

# New  plot
#fit <- glm.nb(SUM ~ country, data = co_data, link = log)

#co_data <- cbind(co_data, Mean = predict(fit, newdata = co_data, type = "response"), SE = predict(fit, newdata = co_data, type = "response", se.fit = T)$se.fit)

#ARG.sum.co_outl <- ggplot(co_data, aes(x = country, y = Mean)) +
    #scale_color_manual(values=c("#FF333F", "#35E0F5", "#531592")) +
    #geom_line() + geom_jitter(data = co_data, aes(x = country, y = SUM, color = country), 
    #size = 2.3, alpha = 0.5, width = 0.3) + 
    #geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.6, lwd = 0.6) + 
    #geom_point(size = 0.9) +
    #theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    #labs(y = "Sum abundance/16S", x = "") + guides(color = FALSE, alpha = FALSE) + labs(title = "ARGs") + scale_y_log10()
#ARG.sum.co_outl

Check if significant

co_data <- data.frame(SUM = round(mean16S_counts * (sample_sums(otu_table(resfinder_PHY_clean)))), 
    country = sample_data(resfinder_PHY_clean)$country)

# Exclude outliers
#co_data[which(co_data$SUM %in% outliers),]
#co_data <- co_data[-which(co_data$SUM %in% outliers),]

fit <- glm.nb(SUM ~ country, data = co_data, link = log)

# Tukey's post hoc test
glht.mod <- glht(fit, mcp(country = "Tukey"))
summary(glht(glht.mod))
## Warning in chkdots(...): Argument(s) 'complete' passed to '...' are ignored

## Warning in chkdots(...): Argument(s) 'complete' passed to '...' are ignored
# No significant results. Maybe there would be if only actual hospital ww samples were included? Then Savalou environment would be excluded with its low sum abundance.

ARG Classes, sum relative abundance by location

resfinder_PHY_Class <- tax_glom(resfinder_PHY_clean, taxrank = "Class")

# All classes
resfinder_PHY_Class_abund <- prune_taxa(names(sort(taxa_sums(resfinder_PHY_Class), TRUE)), 
    resfinder_PHY_Class)

# Normalize by number of samples in each country
otu_table(resfinder_PHY_Class_abund) <-   (otu_table(resfinder_PHY_Class_abund)[,]/as.matrix(table(sample_data(resfinder_PHY_Class_abund)$country))[, 1])
## Warning in (if (isS4(e1)) e1@.Data else e1)/if (isS4(e2)) e2@.Data else e2:
## longer object length is not a multiple of shorter object length
p17<-plot_bar(resfinder_PHY_Class_abund, x="name", fill="Class")
resfinder.p17 <- p17 + facet_grid(~ location, scales = "free", space = "free") + 
  ggtitle("Sum relative abundance of ARGs by classes/16S") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle=90, hjust=1, size = 8),
        axis.title.x = element_blank()) +
  guides(fill = guide_legend(label.theme = element_text(size = 8), keywidth = 1.2, keyheight = 1.2))

# Remove OTU separators
resfinder.p17 + geom_bar(aes(color=Class, fill=Class), stat="identity", position="stack")

# Top 12 ARG genes, sum abundance

resfinder_PHY_Gene <- tax_glom(resfinder_PHY_clean, taxrank = "Gene")

resfinder_PHY_Gene_abund <- prune_taxa(names(sort(taxa_sums(resfinder_PHY_Gene), TRUE)[1:12]), 
    resfinder_PHY_Gene)

# Normalize by number of samples in each country
otu_table(resfinder_PHY_Gene_abund) <-   (otu_table(resfinder_PHY_Gene_abund)[,]/as.matrix(table(sample_data(resfinder_PHY_Gene_abund)$country))[, 1])

p18<-plot_bar(resfinder_PHY_Gene_abund, x="name", fill="Gene")
resfinder.p18 <- p18 + facet_grid(~ location, scales = "free", space = "free") + 
  ggtitle("Sum relative abundances of ARGs/16S") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle=90, hjust=1, size = 8),
        axis.title.x = element_blank()) +
  guides(fill = guide_legend(label.theme = element_text(size = 8), keywidth = 1.2, keyheight = 1.2))

# Remove OTU separators
resfinder.p18 + geom_bar(aes(color=Gene, fill=Gene), stat="identity", position="stack")

# Betalactams

resfinder_PHY_betalactams = subset_taxa(resfinder_PHY_clean, Class=="Betalactam")

resfinder_PHY_betalactams_Gene <- tax_glom(resfinder_PHY_betalactams, "Gene")

# Top12
resfinder_PHY_betalactams_Gene_abund <- prune_taxa(names(sort(taxa_sums(resfinder_PHY_betalactams_Gene), TRUE)[1:12]), 
    resfinder_PHY_betalactams_Gene)

# Normalize by sample number in each country
otu_table(resfinder_PHY_betalactams_Gene_abund) <- (otu_table(resfinder_PHY_betalactams_Gene_abund)[, ]/as.matrix(table(sample_data(resfinder_PHY_betalactams_Gene_abund)$country))[, 
    1])

p19<-plot_bar(resfinder_PHY_betalactams_Gene_abund, x="name", fill="Gene")
resfinder.p19 <- p19 + facet_grid(~ location, scales = "free", space = "free") + 
  ggtitle("Sum of relative abundances of top 12 betalactams/16S") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle=90, hjust=1, size = 8),
        axis.title.x = element_blank()) +
  guides(fill = guide_legend(label.theme = element_text(size = 8), keywidth = 1.2, keyheight = 1.2))

# Remove OTU separators
resfinder.p19 + geom_bar(aes(color=Gene, fill=Gene), stat="identity", position="stack")